# encoding:UTF-8
import MySQLdb,re
import pandas as pd
import numpy as np
from scipy.misc import imread
import matplotlib.pyplot as plt
from snownlp import SnowNLP
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import jieba
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again
# (it is hidden after interpreter start-up), and the call below makes UTF-8
# the codec used for implicit str <-> unicode conversions in this script.
reload(sys)
sys.setdefaultencoding("utf8")

def _extract_chinese(signature, _non_chinese=re.compile(u'[^\u4e00-\u9fa5]')):
    """Return the Chinese character runs of *signature* joined by commas.

    Every character outside U+4E00..U+9FA5 acts as a separator; empty runs
    are dropped, so a signature without Chinese characters yields ''.
    The pattern is compiled once (as a default argument) instead of per row.
    """
    return ",".join(run for run in _non_chinese.split(signature) if run)


def readmysql(max_id=1000):  # Read the user records from the database
    """Load user rows from MySQL and derive the inputs for the analyses.

    Runs ``SELECT * FROM weixin.test WHERE id < max_id`` on the local server.
    Columns are read by position: 0 = user id, 1 = nick name, 3 = province,
    4 = city, 5 = signature.

    Args:
        max_id: Exclusive upper bound for the ``id`` column (default 1000,
            the value that used to be hard-coded).

    Returns:
        A tuple ``(signaturelist, userlist, provincelist)`` where
        ``signaturelist`` holds the comma-joined Chinese parts of every
        non-empty signature, ``userlist`` holds the first six columns of each
        distinct row as a list, and ``provincelist`` holds every non-empty
        province (one entry per distinct row).
    """
    signaturelist = []
    provincelist = []
    userlist = []
    seen = set()

    # Connect to the server. The connection and cursor are closed explicitly
    # rather than via ``with conn:``, whose semantics differ between
    # MySQLdb/mysqlclient releases (older ones only commit/rollback).
    conn = MySQLdb.connect(host='localhost', user='root', passwd='', charset="utf8")
    try:
        cur = conn.cursor()
        try:
            # Parameterized query: the driver does the quoting.
            cur.execute("SELECT * FROM weixin.test WHERE id < %s", (max_id,))
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    for row in rows:
        record = list(row[:6])
        # De-duplicate on the stored six-column record (hashed, O(1) per row).
        key = tuple(record)
        if key in seen:
            continue
        seen.add(key)
        userlist.append(record)

        province = record[3]
        if province:
            provincelist.append(province)

        signature = record[5]
        if signature:
            # Keep only the Chinese text of the signature.
            signaturelist.append(_extract_chinese(signature))
    return signaturelist, userlist, provincelist
def _as_text(value):
    """Decode UTF-8 byte strings so equal names share one tally key."""
    if isinstance(value, bytes):
        return value.decode('utf8')
    return value


def count_cities(userlist, province='北京'):
    """Tally the cities of the users living in *province*.

    Args:
        userlist: Iterable of user records indexed by position, with the
            province at index 3 and the city at index 4.
        province: Province whose users are counted (default '北京').

    Prints the number of matching users with a non-empty city, the
    city -> occurrences dict, and the number of distinct cities.

    Returns:
        The city -> occurrences dict.
    """
    # Records with an empty or NULL city are skipped.
    citylist = [item[4] for item in userlist if item[3] == province and item[4]]
    print(len(citylist))
    stat = dict(Counter(citylist))
    print(stat)
    print(len(stat))
    return stat


def count_provinces(provincelist):
    """Tally how often each province name occurs.

    Empty entries are ignored. Prints and returns the
    province -> occurrences dict.
    """
    stat = dict(Counter(_as_text(item) for item in provincelist if item))
    print(stat)
    return stat


def count(provincelist):
    """Tally cities or provinces, depending on what the list contains.

    This file used to define ``count`` twice, so the second (province)
    definition silently replaced the first (city) one and ``count(userlist)``
    never produced the city statistics. Both call styles are kept working:

    * a list of user records (lists/tuples) -> ``count_cities``
    * a list of province names (or an empty list) -> ``count_provinces``

    Args:
        provincelist: Either the user records or the province names; the
            parameter keeps the name of the definition that was in effect.

    Returns:
        The tally dict produced by the selected helper.
    """
    items = list(provincelist)
    if items and isinstance(items[0], (list, tuple)):
        return count_cities(items)
    return count_provinces(items)

def function(signaturelist):
    """Print every entry of *signaturelist* on its own line."""
    for signature in signaturelist:
        print(signature)
def wordtocloud(signaturelist):
    """Draw a word cloud of the signatures and show it in a matplotlib window.

    Each signature is segmented with jieba (precise mode). The cloud is
    shaped and coloured by the image ``bg2.jpg`` and rendered with
    ``font.ttf``; both files are looked up relative to the working directory.

    Args:
        signaturelist: Iterable of signature strings.

    Returns:
        None. Nothing is drawn when the signatures contain no text.
    """
    # Tokens are space-separated within AND between signatures; without the
    # outer separator the last word of one signature fuses with the first
    # word of the next.
    fulltext = ' '.join(
        ' '.join(jieba.cut(signature, cut_all=False))
        for signature in signaturelist
    )
    if not fulltext.strip():
        # No words at all: WordCloud cannot lay out an empty vocabulary.
        return

    back_coloring = imread("bg2.jpg")
    cloud = WordCloud(
        font_path='font.ttf',  # required for Chinese text, otherwise boxes are drawn instead of characters
        background_color="white",  # background colour
        max_words=500,  # maximum number of words shown in the cloud
        mask=back_coloring,  # background (mask) image
        max_font_size=100,  # largest font size
        random_state=42,
        # Default picture size; when a mask image is used the picture takes
        # the mask's size instead. margin is the gap around each word.
        width=1000, height=860, margin=2,
    )
    wc = cloud.generate(fulltext)
    # Recolour the words with the colours of the mask image.
    image_colors = ImageColorGenerator(back_coloring)
    plt.figure("wordc")
    plt.imshow(wc.recolor(color_func=image_colors))
    # plt.axis("off")
    plt.show()
if __name__ == '__main__':
    # Script entry point: load the data, print both tallies, show the word
    # cloud, then print the cleaned signatures.
    signaturelist, userlist, provincelist = readmysql()
    for tally_input in (userlist, provincelist):
        count(tally_input)
    wordtocloud(signaturelist)
    function(signaturelist)